Given a Bank customer, build a neural network-based classifier that can determine whether they will leave or not in the next 6 months.
Businesses like banks that provide service have to worry about the problem of 'Churn' i.e. customers leaving and joining another service provider. It is important to understand which aspects of the service influence a customer's decision in this regard. Management can concentrate efforts on the improvement of service, keeping in mind these priorities.
The case study is from an open-source dataset from Kaggle. The dataset contains 10,000 sample points with 14 distinct features such as CustomerId, CreditScore, Geography, Gender, Age, Tenure, Balance, etc.
import statsmodels.api as statsmodel
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.graphics.gofplots import qqplot_2samples
from statsmodels.graphics.gofplots import qqplot
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
import plotly.graph_objs as po
import plotly.offline as py
from sklearn import tree
#from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.tree import export_graphviz
from sklearn.tree import plot_tree
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
import plotly.figure_factory as ff
# Load the Kaggle bank-churn dataset and take a first look at it.
df=pd.read_csv('bank.csv')
print(df.head())
df.shape
df.describe().transpose()
df.isnull().sum()
# Data type of the columns
df.dtypes
# Creating a profile report for exploratory analysis
#!pip install pandas_profiling
# NOTE(review): pandas_profiling was renamed upstream to ydata-profiling;
# this import requires the legacy package to be installed.
import pandas_profiling
df.profile_report()
# ------------------------------------------------------------------
# Pre-processing: drop identifier columns, binarize gender, one-hot
# encode the remaining categoricals and max-normalize the numericals.
# ------------------------------------------------------------------
bank_data = df
# Identifier attributes carry no predictive signal.
bank_data.drop(['RowNumber', 'CustomerId', 'Surname'], axis=1, inplace=True)
bank_data.rename(columns={'Gender': 'IsMale'}, inplace=True)
# BUGFIX: the original chained-indexing assignments
# (bank_data.IsMale.loc[...] = 0 / 1) raise SettingWithCopyWarning and,
# worse, leave IsMale with object dtype, so pd.get_dummies() below would
# one-hot encode it into IsMale_0/IsMale_1 and shift every downstream
# column index (bankX/bankY assume 13 columns with Exited last, and
# list_binary/list_normal index the intended layout).  Mapping to int
# keeps IsMale in the numerical subset as intended.
bank_data['IsMale'] = (bank_data['IsMale'] == 'Male').astype(int)
# Separate numerical (to normalize) and categorical variables (to one-hot encode).
num_subset = bank_data.select_dtypes('number')
cat_subset = bank_data.select_dtypes('object')
# One-hot encode the categorical features.
cat_subset = pd.get_dummies(cat_subset)
# A denormalized but organized copy of the dataset, used by some figures.
denorm_bank_data = pd.concat([cat_subset, num_subset], axis=1)
# Normalize the continuous variables by their column maximum.
maxvals = num_subset.astype(float).max()
numericalColumns = {'CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary'}
for col in numericalColumns:
    num_subset[col] = num_subset[col] / maxvals[col]
# Concatenate: categorical dummies first, numerical columns (ending in Exited) last.
bank_data = pd.concat([cat_subset, num_subset], axis=1)
# uint8: unsigned integer 0-255, int64: 64-bit signed integer, float64: double precision float
print(bank_data.dtypes)
display(bank_data.head())
# Defining X (first 12 feature columns) and Y (the Exited label).
bankX = bank_data.iloc[:, :12]
bankY = bank_data.iloc[:, 12:13]
X = bankX.values  # numpy array (10000, 12)
Y = bankY.values  # numpy array (10000, 1)
# Histogram grid of all variables: binary on the top row, continuous on the bottom.
list_binary = [0, 1, 2, 4, 9, 10]      # column indices holding 0/1 variables
list_normal = [3, 5, 6, 7, 8, 11]      # column indices holding continuous variables
order = [0, 1, 2, 4, 9, 10, 8, 6, 5, 3, 7, 11]  # binary first, then continuous
fig = plt.figure(figsize=(20, 8))
for i in range(len(order)):
    xi = denorm_bank_data.values[:, order[i]]
    # BUGFIX: add_subplot requires integer grid dimensions; len(order)/2 is a
    # float and raises TypeError on modern matplotlib -> use floor division.
    ax1 = fig.add_subplot(2, len(order) // 2, i + 1)
    plt.title(list(bank_data)[order[i]], fontsize=16)
    if i < 6:
        # Binary variable: two bins, with the ticks relabelled 0/1.
        plt.hist(xi, 2)
        plt.xticks([0.25, 0.75], np.arange(0, 2, 1))
    else:
        plt.hist(xi, 100)
plt.suptitle('Distributions of the Variables', fontsize=30)
plt.show()
# Pie chart of the customer split by country of residence.
fig = plt.figure(figsize=(5, 5))
# Count customers per country from the one-hot geography columns.
country_counts = {
    'Germany': bankX.Geography_Germany.sum(),
    'Spain': bankX.Geography_Spain.sum(),
    'France': bankX.Geography_France.sum(),
}
labels = list(country_counts.keys())
sizes = list(country_counts.values())
colors = ['lightcoral', 'gold', 'cadetblue']
plt.title('Nationality - Proportion', fontsize=20)
plt.pie(sizes, labels=labels, colors=colors, autopct='%1.1f%%',
        shadow=True, startangle=140, textprops={'fontsize': 14})
plt.axis('equal')  # keep the pie circular
plt.show()
# ------------------------------------------------------------------
# Explore zero balances and salary/balance outliers for retained customers.
# ------------------------------------------------------------------
testdf = df[df.Exited == 0]
testdf.IsActiveMember.value_counts()
testdf = df[df.Exited == 0]
bal0rows = testdf[testdf.Balance == 0]
print(bal0rows.shape)
# Zero balances cannot be used as-is; these values need imputation.
bal0activerows = bal0rows[bal0rows.IsActiveMember == 0]
print(bal0activerows.shape)
testdf['Balance'].hist(bins=6)
testdf['EstimatedSalary'].hist(bins=6)
# BUGFIX: take an explicit copy before adding the derived column below,
# otherwise the assignment writes into a view of df and triggers
# SettingWithCopyWarning (and may not persist).
testdf = df[df.Balance != 0].copy()
testdf['balance_percent_estimatedSalary'] = testdf['Balance'] / testdf['EstimatedSalary']
testdf.head()
testdf.boxplot(column=['EstimatedSalary'])
# outlier check
testdf.boxplot(column=['EstimatedSalary'])
# Floor estimated salaries at 25000 to remove low-end outliers.
df['EstimatedSalary'] = np.where(df['EstimatedSalary'] < 25000, 25000, df['EstimatedSalary'])
df.boxplot(column=['EstimatedSalary'])
df.boxplot(column=['Balance'])
testdf.boxplot(column=['Balance'])
# Balance itself is not capped: these are actual figures, not estimates.
# Only rows with Balance == 0 are imputed later (via the ratio below).
testdf.boxplot(column=['balance_percent_estimatedSalary'])
# ~6% of the ratio values are outliers, so cap the ratio at 5.
testdfbal = testdf
testdfbal['balance_percent_estimatedSalary'] = np.where(
    testdfbal['balance_percent_estimatedSalary'] > 5, 5,
    testdfbal['balance_percent_estimatedSalary'])
testdfbal.boxplot(column=['balance_percent_estimatedSalary'])
testdfbal['balance_percent_estimatedSalary'].mean()
# Filters kept for reference (not used below).
fil1 = df['Balance'] == 0
fil3 = df['IsActiveMember'] == 1
fil2 = df['Exited'] == 0
df.describe()
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.patches as mpatches
# 3-D scatter of three selected features, colored by the churn label.
sns.set_style("white")
colors = np.array(["green", "red"])  # index 0 = stayed, 1 = exited
alphas = np.array([0.1, 1])
# Flatten Y (shape (10000, 1)) into an integer label vector for color lookup.
churn = np.array([int(Y[k]) for k in range(len(Y))])
fig = plt.figure(figsize=(6, 6))
# BUGFIX: Axes3D(fig) no longer attaches the axes to the figure on modern
# matplotlib (>= 3.4), which yields a blank plot; use add_subplot instead.
ax = fig.add_subplot(projection='3d')
ax.scatter(X[:, 5:6], X[:, 7:8], X[:, 10:11], color=colors[churn], alpha=0.2)
ax.set_xlabel(list(bankX)[5], fontsize=16)
ax.set_ylabel(list(bankX)[7], fontsize=16)
ax.set_zlabel(list(bankX)[10], fontsize=16)
green_patch = mpatches.Patch(color='green', label='Still Customer')
red_patch = mpatches.Patch(color='red', label='Exited')
plt.legend(handles=[red_patch, green_patch], bbox_to_anchor=(1.2, 0.35))
plt.title('Customer Churn', fontsize=22)
plt.show()
def ClusteringCustomers(k):
    """Cluster the customers in CorrX into *k* KMeans clusters, plot the
    cluster sizes and a 3-D scatter, and print each cluster's churn-risk ratio.

    Relies on module-level globals: CorrX / dfCorrX (feature matrix and its
    DataFrame), bankY (Exited labels) and KMeans (imported before the call).
    """
    sns.set_style("white")
    clusters = KMeans(n_clusters=k).fit(CorrX)
    initial_labelled = clusters.labels_
    # --- Relabel clusters so label order follows the cluster-center sums ---
    basis_sum_original = clusters.cluster_centers_.sum(axis=1)
    basis_sum_control = sorted(basis_sum_original)
    # NOTE: the original comprehension variable shadowed the parameter k;
    # renamed to s (same values, clearer intent).
    final_labelled = [basis_sum_control.index(s) for s in basis_sum_original[initial_labelled]]
    # ----------------------------------------------------------------------
    # Histogram of the number of customers per cluster.
    plt.figure()
    plt.title('Number of Customers per Cluster', fontsize=20)
    plt.hist(final_labelled, bins=range(k + 1))
    plt.xticks(np.arange(0.5, k), np.arange(1, k + 1, 1))
    plt.show()
    # 3-D scatter of the customers, colored by cluster.
    fig = plt.figure(figsize=(6, 6))
    # BUGFIX: Axes3D(fig) no longer attaches axes on matplotlib >= 3.4.
    ax = fig.add_subplot(projection='3d')
    colors = np.array(["#033500", "#840000", "blue", "orange", 'purple', "#363737"])
    ax.scatter(CorrX[:, 0], CorrX[:, 1], CorrX[:, 2], color=colors[final_labelled],
               s=20, depthshade=True, alpha=0.2)
    plt.title('Clusters of Customers - k = ' + str(k), fontsize=20)
    ax.set_xlabel(list(dfCorrX)[0], fontsize=16)
    ax.set_ylabel(list(dfCorrX)[1], fontsize=16)
    ax.set_zlabel(list(dfCorrX)[2], fontsize=16)
    patches = []
    for i in range(k):
        patches.append(mpatches.Patch(color=colors[i], label='Cluster ' + str(i)))
    ax.legend(handles=patches, bbox_to_anchor=(1.15, 0.35))
    plt.show()
    # Risk per cluster: share of all exited customers that fall in the
    # cluster (risk would be 1.0 if every exited customer landed in one cluster).
    for p in range(k):
        ClusterExited = int(bankY[[i == p for i in final_labelled]].sum())
        TotalExited = int(bankY.sum())
        ClusterRisk = ClusterExited / TotalExited
        print('The risk ratio of Cluster', p, 'is', round(ClusterRisk * 100, 1), '%')
    return
# Build the clustering feature matrix from three selected attributes
# and run the k-means analysis with k = 4.
dfCorrX = bankX.reindex(['Age', 'Balance','IsActiveMember'], axis=1)
CorrX = dfCorrX.values
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, silhouette_samples
from matplotlib.ticker import MaxNLocator
ClusteringCustomers(k=4)
# No missing values: every column has 10,000 entries.
# Split the data into training and test sets with a 70/30 ratio.
train, test = train_test_split(df, test_size=0.3)
# to show there are no missing values
df.isnull().sum()
# visualize the dimensions of the data
df.shape
# check the unique values of each attribute
df.nunique()
# LET'S TRY TO VISUALIZE THE FEATURES ONE BY ONE
# Exited vs. loyal: percentage split pie chart.
ExitedValues = df.Exited.value_counts()
labels = ["Loyal Customer","Churn Customers"]
colors = ['green', 'red']
fig1, f1 = plt.subplots()
f1.pie(ExitedValues,labels=labels, colors = colors, autopct='%1.1f%%',shadow=True, startangle=60)
f1.axis('equal')
plt.tight_layout()
plt.show()
# Implies the dataset is imbalanced: roughly 80% of customers are loyal and
# 20% churn, so the classes will need balancing before modelling.
plt.hist(df['CreditScore'])
plt.xlabel('Credit Score Distribution')
plt.ylabel('Num of Customers')
plt.title('CreditScore')
plt.hist(df['Balance'])
plt.xlabel('Account Balance Distribution')
plt.ylabel('Num of Customers')
plt.title('Balance')
# Impute zero balances as 1.7x the estimated salary
# (presumably derived from the capped balance/salary ratio computed above -- TODO confirm).
df['Balance']=np.where(df['Balance']==0,(df['EstimatedSalary']*1.7),df['Balance'])
# Percentage split based on Geography.
Geosplit = train.Geography.value_counts()
Geovalues = df['Geography'].value_counts().values.tolist()
Geolabels = df['Geography'].value_counts().keys().tolist()
colors = ['#66b3ff', '#ff9999' , '#ffcc99']
fig2, f2 = plt.subplots()
f2.pie(Geovalues,labels=Geolabels, colors = colors, autopct='%1.1f%%',shadow=True, startangle=90)
# Equal aspect ratio ensures that pie is drawn as a circle
f2.axis('equal')
plt.tight_layout()
plt.title('Percentage split based on Geography')
plt.show()
sns.boxplot(data= df['Tenure'], orient="v")
# Percentage split based on credit-card possession (training split).
HasCardvalues = train['HasCrCard'].value_counts().values.tolist()
HasCardlabels = ["Having Card" , "No Card"]
colors = ['#99ff99','#ffcc99']
fig5, f5 = plt.subplots()
f5.pie(HasCardvalues ,labels=HasCardlabels, colors = colors,autopct='%1.1f%%',shadow=True, startangle=60)
f5.axis('equal')
plt.title('Percentage split based on Card Possession')
plt.tight_layout()
plt.show()
sns.boxplot(df['Age'] , orient = "v")
# Pairwise correlations of all columns.
df[df.columns].corr()
# Heatmap of the correlation matrix to spot correlated columns graphically.
# NOTE(review): on pandas >= 2.0, df.corr() raises if object columns
# (Geography/Gender) are still present -- may need numeric_only=True; confirm.
fig, ax = plt.subplots()
fig.set_size_inches(11.7, 8.27)
sns.set(font_scale = 0.75)
sns.heatmap(df.corr(), annot = True, fmt = ".6f")
plt.show()
# There is no significant correlation among the columns.
# Split the frame by churn outcome and bucket the columns by cardinality.
churn = df[df["Exited"] == 1]
not_churn = df[df["Exited"] == 0]
target_column = ["Exited"]
# Columns with fewer than 5 distinct values are treated as categorical;
# everything else (except the target) is numerical.
nuniques = df.nunique()
cat_columns = [c for c in nuniques[nuniques < 5].keys().tolist() if c not in target_column]
num_columns = [c for c in df.columns if c not in target_column + cat_columns]
def plot_visualization(column):
    """Render side-by-side donut charts of *column*'s value distribution for
    churned vs. retained customers (uses the global churn / not_churn frames)."""
    churn_counts = churn[column].value_counts()
    keep_counts = not_churn[column].value_counts()
    pie_churn = go.Pie(
        labels=churn_counts.keys().tolist(),
        values=churn_counts.values.tolist(),
        hoverinfo="label+percent+name",
        domain=dict(x=[0, .48]),
        name="Churn Customers",
        marker=dict(line=dict(width=2, color="rgb(243,243,243)")),
        hole=.7,
    )
    pie_keep = go.Pie(
        values=keep_counts.values.tolist(),
        labels=keep_counts.keys().tolist(),
        hoverinfo="label+percent+name",
        marker=dict(line=dict(width=2, color="rgb(243,243,243)")),
        domain=dict(x=[.52, 1]),
        hole=.7,
        name="Non churn customers",
    )
    layout_details = go.Layout(dict(
        title=column + " distribution in customer attrition ",
        paper_bgcolor="rgb(243,243,243)",
        annotations=[
            dict(text="churn customers", font=dict(size=13), showarrow=False, x=.15, y=.5),
            dict(text="Non churn customers", font=dict(size=13), showarrow=False, x=.88, y=.5),
        ],
    ))
    fig = go.Figure(data=[pie_churn, pie_keep], layout=layout_details)
    py.iplot(fig)
# Pie plot for the first categorical column (Geography).
plot_visualization(cat_columns[0])
def hist_visulalization(column):
    """Overlayed percent-normalized histograms of *column* for churned vs.
    retained customers (uses the global churn / not_churn frames)."""
    hist_churn = go.Histogram(
        x=churn[column], histnorm="percent", name="Churn Customers",
        marker=dict(line=dict(width=.5, color="black")), opacity=.9)
    hist_keep = go.Histogram(
        x=not_churn[column], histnorm="percent", name="Non churn customers",
        marker=dict(line=dict(width=.5, color="black")), opacity=.9)
    # Shared axis styling for both axes.
    axis_style = dict(zerolinewidth=1, ticklen=5, gridwidth=2)
    layout_details = go.Layout(dict(
        title=column + " distribution in customer attrition ",
        paper_bgcolor="rgb(243,243,243)",
        plot_bgcolor="rgb(243,243,243)",
        xaxis=dict(gridcolor='rgb(256, 256, 256)', title=column, **axis_style),
        yaxis=dict(gridcolor='rgb(255, 255, 255)', title="percent", **axis_style),
    ))
    fig = go.Figure(data=[hist_churn, hist_keep], layout=layout_details)
    py.iplot(fig)
# Histogram for the second numerical column.
hist_visulalization(num_columns[1])
# As we deal with elder customers (in terms of age) there is a risk of losing them.
# Pie plot for the second categorical column (gender).
plot_visualization(cat_columns[1])
# Histogram for the tenure column.
hist_visulalization(num_columns[2])
# Pie plot for the fourth categorical column.
# NOTE(review): the original comment here was truncated
# ("70% of the people who leave, do not") -- intent unclear, confirm.
plot_visualization(cat_columns[3])
# Bar chart of the target-label distribution before upsampling,
# annotated with the percentage of each class.
total_len = len(df['Exited'])
sns.set()
# BUGFIX: seaborn >= 0.13 no longer accepts the data vector positionally.
sns.countplot(x=df.Exited).set_title('Data Distribution')
ax = plt.gca()
for p in ax.patches:
    height = p.get_height()
    ax.text(p.get_x() + p.get_width() / 2.,
            height + 2,
            '{:.2f}%'.format(100 * (height / total_len)),
            fontsize=14, ha='center', va='bottom')
sns.set(font_scale=1.5)
ax.set_xlabel("Labels for exited column")
ax.set_ylabel("Numbers of records")
plt.show()
df['Exited'].value_counts()
from sklearn.utils import resample
# ------------------------------------------------------------------
# Balance the classes: upsample the minority (Exited == 1) class
# to the size of the majority class.
# ------------------------------------------------------------------
df_majority = df[df.Exited==0]
df_minority = df[df.Exited==1]
df_minority_upsampled = resample(df_minority,
                                 replace=True,                # sample with replacement
                                 n_samples=len(df_majority),  # match majority size (was hard-coded 7963)
                                 random_state=123)            # reproducible results
df_upsampled = pd.concat([df_majority, df_minority_upsampled])
df_upsampled.Exited.value_counts()
# Label distribution after balancing, annotated with class percentages.
total_len = len(df_upsampled['Exited'])
sns.set()
# BUGFIX: seaborn >= 0.13 no longer accepts the data vector positionally.
sns.countplot(x=df_upsampled.Exited).set_title('Data Distribution')
ax = plt.gca()
for p in ax.patches:
    height = p.get_height()
    ax.text(p.get_x() + p.get_width() / 2.,
            height + 2,
            '{:.2f}%'.format(100 * (height / total_len)),
            fontsize=14, ha='center', va='bottom')
sns.set(font_scale=1.5)
ax.set_xlabel("Labels for exited column")
ax.set_ylabel("Numbers of records")
plt.show()
# One-hot encode the upsampled frame and min-max scale the continuous columns.
df_new = df_upsampled
df_new.dtypes
df_new = pd.get_dummies(df_new, drop_first=True)
df_new.head()
df_new.columns
df_ex_ai = df_new
# Continuous columns to be scaled.
continuous_cols = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary']
scale_down_column = pd.DataFrame(df_new[continuous_cols])
# Cast every continuous column to float before scaling.
scale_down_column = scale_down_column.astype({c: 'float64' for c in scale_down_column.columns})
scale_down_column.dtypes
type(scale_down_column)
from sklearn.preprocessing import MinMaxScaler
scalar = MinMaxScaler()
scalar.fit(scale_down_column)
scale_down_column1 = scalar.transform(scale_down_column)
scaled_df = pd.DataFrame(scale_down_column1, columns=scale_down_column.columns)
scaled_df.head()
scaled_df.shape
scaled_df = scaled_df.reset_index(drop=True)
scaled_df.head()
# ==================================================================
# Baseline ANN: re-load the raw data, encode, scale and train a small
# feed-forward network (6-6-1) with a sigmoid output for churn.
# ==================================================================
df = pd.read_csv('bank.csv')
# Features: columns 3..-2 (skips the RowNumber/CustomerId/Surname identifiers);
# label: the last column (Exited).
x = df.iloc[:,3:-1].values
print(x)
y = df.iloc[:,-1].values
print(y)
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
# Label-encode Gender (column 2 of the sliced matrix) as 0/1.
x[:, 2] = le.fit_transform(x[:, 2])
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
# One-hot encode Geography (column 1); encoded columns move to the front.
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [1])], remainder='passthrough')
x = np.array(ct.fit_transform(x))
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 0)
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
# Standardize features; fit on the training split only to avoid leakage.
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)
# Two hidden ReLU layers of 6 units each; sigmoid output for the churn probability.
ann = tf.keras.models.Sequential()
ann.add(tf.keras.layers.Dense(units=6, activation='relu'))
ann.add(tf.keras.layers.Dense(units=6, activation='relu'))
ann.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))
ann.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])
ann.fit(x_train, y_train, batch_size = 32, epochs = 100)
# Threshold the predicted probabilities at 0.5 to get class labels.
y_pred = ann.predict(x_test)
y_pred = (y_pred > 0.5)
# Side-by-side comparison of predictions vs. ground truth.
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)
import pandas as pd
import seaborn as sn
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import keras
from keras.models import Sequential
from keras.layers import Dense
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
# Confusion-matrix heatmap for the baseline ANN predictions.
conf_matrix = confusion_matrix(y_test.astype('int'), y_pred.astype('int'))
heat_kwargs = dict(annot=True, fmt='d', square=True,
                   xticklabels=['not churn', 'churn'],
                   yticklabels=['not churn', 'churn'],
                   linewidths=2, linecolor='w', cmap='Set1')
sn.heatmap(conf_matrix, **heat_kwargs)
plt.subplots_adjust(wspace=.3, hspace=.3)
# ==================================================================
# Second modelling round: reload the data, encode the categoricals and
# produce scaled train/test splits for the Keras experiments below.
# ==================================================================
df = pd.read_csv('bank.csv')
# Drop the unique identifier columns that cannot impact the outcome.
df.drop(['RowNumber', 'CustomerId', 'Surname'], axis=1, inplace=True)
df_new = pd.get_dummies(df, columns=['Geography'], drop_first=True)
df_new.head()
# Convert the Gender column to 1 (male) / 0 (female).
mf = lambda x: 1 if x == 'Male' else 0
bank_churn_mod1_df = df_new.copy(deep=True)
bank_churn_mod1_df['Gender'] = df_new['Gender'].apply(mf)
bank_churn_mod1_df.head(10)
# Train/test split (70/30).
from sklearn.model_selection import train_test_split
X = bank_churn_mod1_df.drop('Exited', axis=1)
y = bank_churn_mod1_df.Exited
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=1)
print(y_test)
# Class balance overall vs. in the training split.
bank_churn_mod1_df['Exited'].value_counts(normalize=True)
y_train.value_counts(normalize=True)
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
# Standardize features; the scaler is fit on the training split only.
scaler = StandardScaler()
Xtrain_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
Xtest_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense
import math
from tensorflow.keras import optimizers
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, precision_recall_curve, auc
from sklearn import metrics
from tensorflow.keras import Input
def draw_cm(ytest, predicty):
    """Print and heat-map the confusion matrix (label order [1, 0]), report
    recall and precision, and return the confusion-matrix DataFrame."""
    cm = metrics.confusion_matrix(ytest, predicty, labels=[1, 0])
    df_cm = pd.DataFrame(cm, index=["1", "0"],
                         columns=["Predict 1", "Predict 0"])
    print(df_cm.T)
    plt.figure(figsize=(7, 5))
    sns.heatmap(df_cm.T, annot=True, fmt='d')
    # BUGFIX: the scores were computed against the global y_test instead of
    # the ytest parameter (correct only when callers happened to pass y_test).
    print("RECALL SCORE = %f" % recall_score(ytest, predicty))
    print("PRECISION SCORE = %f" % precision_score(ytest, predicty))
    return df_cm
def print_conf_summary(df_cm):
    """Print a plain-English summary of the confusion matrix produced by
    draw_cm (rows "1"/"0" = actual class, columns "Predict 1"/"Predict 0")."""
    print("The confusion matrix")
    # BUGFIX: df_cm["Predict 1"][0] relied on positional fallback indexing of
    # a string-labelled Series, which is deprecated (removed in pandas 3.0);
    # use explicit label-based .loc lookups instead (same cells).
    print("True Positives (TP): we correctly predicted that customers will leave : {0}".format(df_cm.loc["1", "Predict 1"]))
    print("True Negatives (TN): we correctly predicted that customers will not leave : {0}".format(df_cm.loc["0", "Predict 0"]))
    print("False Positives (FP): Predicted they will leave but did not leave : {0}".format(df_cm.loc["0", "Predict 1"]))
    print("False Negatives (FN): Predicted they will not leave but left : {0}".format(df_cm.loc["1", "Predict 0"]))
def create_and_run_model(Xtr, ytr, Xte, yte, hidden_layers=2):
    """Build, train and evaluate a feed-forward Keras churn classifier.

    Architecture: up to 3 hidden ReLU layers (the first sized at 90% of the
    input width, each subsequent one at 70% of the previous) and a single
    sigmoid output unit.  Training hyper-parameters come from the
    module-level globals batch_size_no, epoch_no and optim, which must be
    set before each call.  hidden_layers values above 3 behave like 3.

    Returns (trained model, boolean prediction vector for Xte).
    """
    global batch_size_no
    global epoch_no
    global optim
    # BUGFIX: the input width was taken from the global Xtrain_scaled instead
    # of the Xtr parameter (correct only when callers passed Xtrain_scaled).
    ncols = Xtr.shape[1]
    model = Sequential()
    hid_activation = 'relu'
    final_activation = 'sigmoid'
    model.add(Input(shape=(ncols,)))
    if hidden_layers >= 1:
        # First hidden layer: 90% of the input width.
        hl_neurons = math.floor(ncols * 0.90)
        print(hl_neurons)
        model.add(Dense(hl_neurons, activation=hid_activation))
    if hidden_layers >= 2:
        # Second hidden layer: 70% of the previous layer.
        hl_neurons = math.floor(hl_neurons * 0.70)
        print(hl_neurons)
        model.add(Dense(hl_neurons, activation=hid_activation))
    if hidden_layers >= 3:
        # Third hidden layer: 70% of the previous layer.
        hl_neurons = math.floor(hl_neurons * 0.70)
        print(hl_neurons)
        model.add(Dense(hl_neurons, activation=hid_activation))
    # Single sigmoid output unit producing the churn probability.
    model.add(Dense(1, activation=final_activation))
    model.compile(optimizer=optim, loss='binary_crossentropy', metrics=['accuracy'])
    model.fit(Xtr, ytr.values, batch_size=batch_size_no, epochs=epoch_no, verbose=0)
    results = model.evaluate(Xte, yte.values, verbose=0)
    print(model.metrics_names)
    print(results)
    # Threshold predicted probabilities at 0.5.
    predicted_y = model.predict(Xte)
    predicted_y = predicted_y > 0.5
    print(model.summary())
    df_cm = draw_cm(yte, predicted_y)
    print_conf_summary(df_cm)
    return (model, predicted_y)
# Experiment 1: Adam, 1 hidden layer, 10 epochs, batch size 5.
batch_size_no = 5
epoch_no = 10
np.random.seed(100)
# BUGFIX: the `lr` keyword was removed from Keras optimizers (TF >= 2.11);
# use `learning_rate` instead.
optim = optimizers.Adam(learning_rate=0.001)
optimName = 'Adam'
hl = 1
model, ypredict = create_and_run_model(Xtrain_scaled, y_train, Xtest_scaled, y_test, hidden_layers=hl)
# Evaluate once and reuse the (loss, accuracy) pair instead of evaluating twice.
loss, accuracy = model.evaluate(Xtest_scaled, y_test.values, verbose=0)
reportN = pd.DataFrame([[optimName, hl, epoch_no, batch_size_no, loss, accuracy,
                         recall_score(y_test, ypredict), precision_score(y_test, ypredict)]],
                       columns=['Optimizer', 'HiddenLayers', 'Epochs', 'BatchSize', 'Loss', 'Accuracy', 'Recall', 'Precision'])
report = reportN
report.drop_duplicates(inplace=True)
report
batch_size_no = 5 epoch_no = 10 np.random.seed(100) optim = optimizers.SGD(lr = 0.001) optimName = 'SGD' hl = 1 model, ypredict = create_and_run_model(Xtrain_scaled, y_train, Xtest_scaled, y_test,hidden_layers = hl) reportN = pd.DataFrame([[optimName, hl,epoch_no,batch_size_no,model.evaluate(Xtest_scaled, y_test.values,verbose=0)[0], model.evaluate(Xtest_scaled, y_test.values,verbose=0)[1],recall_score(y_test,ypredict), precision_score(y_test,ypredict)]], columns=['Optimizer', 'HiddenLayers','Epochs', 'BatchSize','Loss', 'Accuracy', 'Recall', 'Precision']) report = report.append(reportN, ignore_index=True) report.drop_duplicates(inplace=True) report
# Experiment 3: Adam, 1 hidden layer, 20 epochs, batch size 5.
batch_size_no = 5
epoch_no = 20
np.random.seed(100)
# BUGFIX: `lr` was removed from Keras optimizers (TF >= 2.11); use `learning_rate`.
optim = optimizers.Adam(learning_rate=0.001)
optimName = 'Adam'
hl = 1
model, ypredict = create_and_run_model(Xtrain_scaled, y_train, Xtest_scaled, y_test, hidden_layers=hl)
loss, accuracy = model.evaluate(Xtest_scaled, y_test.values, verbose=0)
reportN = pd.DataFrame([[optimName, hl, epoch_no, batch_size_no, loss, accuracy,
                         recall_score(y_test, ypredict), precision_score(y_test, ypredict)]],
                       columns=['Optimizer', 'HiddenLayers', 'Epochs', 'BatchSize', 'Loss', 'Accuracy', 'Recall', 'Precision'])
# BUGFIX: DataFrame.append was removed in pandas 2.0; use pd.concat.
report = pd.concat([report, reportN], ignore_index=True)
report.drop_duplicates(inplace=True)
report
# Experiment 4: Adam, 2 hidden layers, 20 epochs, batch size 5.
batch_size_no = 5
epoch_no = 20
np.random.seed(100)
# BUGFIX: `lr` was removed from Keras optimizers (TF >= 2.11); use `learning_rate`.
optim = optimizers.Adam(learning_rate=0.001)
optimName = 'Adam'
hl = 2
model, ypredict = create_and_run_model(Xtrain_scaled, y_train, Xtest_scaled, y_test, hidden_layers=hl)
loss, accuracy = model.evaluate(Xtest_scaled, y_test.values, verbose=0)
reportN = pd.DataFrame([[optimName, hl, epoch_no, batch_size_no, loss, accuracy,
                         recall_score(y_test, ypredict), precision_score(y_test, ypredict)]],
                       columns=['Optimizer', 'HiddenLayers', 'Epochs', 'BatchSize', 'Loss', 'Accuracy', 'Recall', 'Precision'])
# BUGFIX: DataFrame.append was removed in pandas 2.0; use pd.concat.
report = pd.concat([report, reportN], ignore_index=True)
report.drop_duplicates(inplace=True)
report
# Experiment 5: Adam, 3 hidden layers, 20 epochs, batch size 5.
batch_size_no = 5
epoch_no = 20
np.random.seed(100)
# BUGFIX: `lr` was removed from Keras optimizers (TF >= 2.11); use `learning_rate`.
optim = optimizers.Adam(learning_rate=0.001)
optimName = 'Adam'
hl = 3
model, ypredict = create_and_run_model(Xtrain_scaled, y_train, Xtest_scaled, y_test, hidden_layers=hl)
loss, accuracy = model.evaluate(Xtest_scaled, y_test.values, verbose=0)
reportN = pd.DataFrame([[optimName, hl, epoch_no, batch_size_no, loss, accuracy,
                         recall_score(y_test, ypredict), precision_score(y_test, ypredict)]],
                       columns=['Optimizer', 'HiddenLayers', 'Epochs', 'BatchSize', 'Loss', 'Accuracy', 'Recall', 'Precision'])
# BUGFIX: DataFrame.append was removed in pandas 2.0; use pd.concat.
report = pd.concat([report, reportN], ignore_index=True)
report.drop_duplicates(inplace=True)
report
# Experiment 6: Adam, hl = 4, 500 epochs, batch size 10.
# NOTE: create_and_run_model only adds up to 3 hidden layers, so hl = 4
# builds the same network as hl = 3 (only the report row differs).
batch_size_no = 10
epoch_no = 500
np.random.seed(100)
# BUGFIX: `lr` was removed from Keras optimizers (TF >= 2.11); use `learning_rate`.
optim = optimizers.Adam(learning_rate=0.001)
optimName = 'Adam'
hl = 4
model, ypredict = create_and_run_model(Xtrain_scaled, y_train, Xtest_scaled, y_test, hidden_layers=hl)
loss, accuracy = model.evaluate(Xtest_scaled, y_test.values, verbose=0)
reportN = pd.DataFrame([[optimName, hl, epoch_no, batch_size_no, loss, accuracy,
                         recall_score(y_test, ypredict), precision_score(y_test, ypredict)]],
                       columns=['Optimizer', 'HiddenLayers', 'Epochs', 'BatchSize', 'Loss', 'Accuracy', 'Recall', 'Precision'])
# BUGFIX: DataFrame.append was removed in pandas 2.0; use pd.concat.
report = pd.concat([report, reportN], ignore_index=True)
report.drop_duplicates(inplace=True)
report
# Experiment 7: SGD, 1 hidden layer, 10 epochs, batch size 5.
batch_size_no = 5
epoch_no = 10
np.random.seed(100)
# BUGFIX: `lr` was removed from Keras optimizers (TF >= 2.11); use `learning_rate`.
optim = optimizers.SGD(learning_rate=0.001)
optimName = 'SGD'
hl = 1
model, ypredict = create_and_run_model(Xtrain_scaled, y_train, Xtest_scaled, y_test, hidden_layers=hl)
loss, accuracy = model.evaluate(Xtest_scaled, y_test.values, verbose=0)
reportN = pd.DataFrame([[optimName, hl, epoch_no, batch_size_no, loss, accuracy,
                         recall_score(y_test, ypredict), precision_score(y_test, ypredict)]],
                       columns=['Optimizer', 'HiddenLayers', 'Epochs', 'BatchSize', 'Loss', 'Accuracy', 'Recall', 'Precision'])
# BUGFIX: DataFrame.append was removed in pandas 2.0; use pd.concat.
report = pd.concat([report, reportN], ignore_index=True)
report.drop_duplicates(inplace=True)
report
# As the recall score is below 50%, the data is imbalanced and more minority-class
# samples are needed for modeling. We can generate more data with oversampling
# and retry the model creation.